Why was this show chosen? Due to all artists being established, they have a fanbase, which could play into the results. On survival shows where aspiring trainees are trying to debut, many trainees are unknown which means that they don’t have any influence from before the show starting.
The format of the show also makes it better than other shows for analysis, due to the outcomes of the contestants being evenly divided, with 7 people eliminated or debuting during each phase. The question this analysis asks how does the various factors of a group/artist affect their placement on Queendom Puzzle?
library(ggplot2)
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ✔ readr 2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.5 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.9
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
library(kknn)
QP_data <- read_excel("QP-Ranking.xlsx")
QP_Ranking <- as_tibble(QP_data)
head(QP_Ranking)
## # A tibble: 6 × 12
## episode initial_rank name ranking signal_song initial_group remix_song
## <dbl> <dbl> <chr> <dbl> <chr> <chr> <chr>
## 1 1 4 Nana 22 PICK on the top Woo!ah! Nxde
## 2 1 1 Yeoreum 1 PICK-CAT WJSN Don't Call…
## 3 1 4 Hwiseo 22 Athena H1-KEY Shut Down
## 4 1 1 Kei 1 Athena Lovelyz Only One
## 5 1 2 Yeeun 8 PICK-CAT CLC Don't Call…
## 6 1 2 Jihan 8 PICK on the top Weeekly Nxde
## # ℹ 5 more variables: all_rounder_team <chr>, vocal_rap_song <chr>,
## # dance_song <chr>, semi_final_song <chr>, finale_song <chr>
QP_Ranking_EL7ZUP <- QP_Ranking %>%
filter(name == "Hwiseo" | name == "Nana" | name == "Yuki" | name == "Kei" | name == "Yeoreum" | name == "Yeonhee" | name == "Yeeun")
gfg_plot <- ggplot(QP_Ranking_EL7ZUP, aes(x=episode, y=ranking, group=name, color=name)) +
geom_line() +
geom_point() +
xlab("Episode Number") +
ylab("Rank") +
scale_x_continuous(breaks=1:10) +
scale_y_continuous(trans="reverse", breaks=1:28) +
facet_wrap(~name) +
scale_colour_discrete(name="Name")
gfg_plot
gfg_plot2 <- ggplot(QP_Ranking, aes(x=episode, y=ranking, group=name, color=name)) +
geom_line() +
geom_point() +
xlab("Episode Number") +
ylab("Rank") +
scale_x_continuous(breaks=1:10) +
scale_y_continuous(trans="reverse", breaks=1:28) +
facet_wrap(~name) +
scale_colour_discrete(name="Name")
gfg_plot2
gfg_plot3 <- ggplot(QP_Ranking, aes(x=episode, y=ranking, group=name, color=signal_song)) +
geom_line() +
geom_point() +
xlab("Episode Number") +
ylab("Rank") +
scale_x_continuous(breaks=1:10) +
scale_y_continuous(trans="reverse", breaks=1:28) +
facet_wrap(~signal_song) +
scale_colour_discrete(name="Signal Song")
gfg_plot3
gfg_plot4 <- ggplot(QP_Ranking, aes(x=episode, y=ranking, group=name, color=initial_group)) +
geom_line() +
geom_point() +
xlab("Episode Number") +
ylab("Rank") +
scale_x_continuous(breaks=1:10) +
scale_y_continuous(trans="reverse", breaks=1:28) +
facet_wrap(~initial_group) +
scale_colour_discrete(name="Affiliation")
gfg_plot4
QP_Ranking_Remix <- filter(QP_Ranking, remix_song != 'NA')
gfg_plot5 <- ggplot(QP_Ranking_Remix, aes(x=episode, y=ranking, group=name, color=remix_song)) +
geom_line() +
geom_point() +
xlab("Episode Number") +
ylab("Rank") +
scale_x_continuous(breaks=1:10) +
scale_y_continuous(trans="reverse", breaks=1:28) +
facet_wrap(~remix_song) +
scale_colour_discrete(name="Remix")
gfg_plot5
To analyze popularity, I will use the ranks given on the first episode of the show that were based on their prior career experience in the industry. This is a fair metric of popularity to compare, except for the fact that Fye and Miru are ranked low as they have no K-pop experience, despite having a fanbase outside of it which could influence the votes.
set.seed(377)
summarized_data <- QP_Ranking %>%
group_by(name) %>%
filter(episode != 1)
summarized_data$initial_rank <- as.factor(summarized_data$initial_rank)
summarized_split <- initial_split(summarized_data, prop = 0.75, strata = initial_rank)
summarized_training <- training(summarized_split)
summarized_testing <- testing(summarized_split)
training_graph <- ggplot(summarized_training, aes(x = episode, y = ranking, color = initial_rank)) +
geom_point() +
geom_jitter() +
labs(x = "Episode", y = "Ranking",
color = "Initial Ranking", title = "Ranking vs Episode") +
scale_x_continuous(breaks=1:10) +
scale_y_continuous(trans="reverse", breaks=1:28) +
theme(text = element_text(size = 15))
training_graph
Note: the graph uses geom_jitter, all ranks and episodes are whole numbers so any point that appears to be off-center is a value representing the closest rank value and episode value.
ranking_info <- summarized_training %>%
group_by(initial_rank) %>%
summarize(mean_rank = mean(ranking))
set.seed(377)
rank_recipe <- recipe(initial_rank ~ episode + ranking, data = summarized_training) %>%
step_scale(all_predictors()) %>%
step_center(all_predictors())
knn_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = 4) %>%
set_engine("kknn") %>%
set_mode("classification")
rank_vfold <- vfold_cv(summarized_training, v = 5, strata = initial_rank)
k_vals <- tibble(neighbors = seq(from = 1, to = 100, by = 5))
knn_flow <- workflow() %>%
add_recipe(rank_recipe) %>%
add_model(knn_spec) %>%
fit(data = summarized_training)
mutated_QP_Ranking <- QP_Ranking |>
mutate(classification = as_factor(initial_rank)) |>
select(classification, episode, ranking)
ep_grid <- seq(min(mutated_QP_Ranking$episode),
max(mutated_QP_Ranking$episode),
length.out = 100)
rank_grid <- seq(min(mutated_QP_Ranking$ranking),
max(mutated_QP_Ranking$ranking),
length.out = 100)
asgrid <- as_tibble(expand.grid(episode = ep_grid,
ranking = rank_grid))
knnPredGrid <- predict(knn_flow, asgrid)
prediction_table <- bind_cols(knnPredGrid, asgrid) %>%
rename(classification = .pred_class)
prediction_table
## # A tibble: 10,000 × 3
## classification episode ranking
## <fct> <dbl> <dbl>
## 1 1 1 1
## 2 1 1.09 1
## 3 1 1.18 1
## 4 1 1.27 1
## 5 1 1.36 1
## 6 1 1.45 1
## 7 1 1.55 1
## 8 1 1.64 1
## 9 1 1.73 1
## 10 1 1.82 1
## # ℹ 9,990 more rows
wkflw_plot <- ggplot() +
geom_point(data = mutated_QP_Ranking,
mapping = aes(x = episode,
y = ranking,
color = classification),
alpha = 0.75) +
geom_point(data = prediction_table,
mapping = aes(x = episode,
y = ranking,
color = classification),
alpha = 0.02,
size = 5) +
labs(color = "Initial Rank",
x = "Episode",
y = "Ranking") +
scale_color_manual(values = c("brown2", "goldenrod1", "dodgerblue", "mediumorchid1")) +
scale_x_continuous(breaks=1:10) +
scale_y_continuous(trans="reverse", breaks=1:28) +
theme(text = element_text(size = 12))
wkflw_plot